package com.chenlb.mmseg4j;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

/**
 * Tokenizer over a {@link Reader} that may mix letters, digits and CJK text.
 * <p>
 * Characters are read one at a time and grouped into runs of the same kind. ASCII-letter runs and
 * {@link Character#OTHER_LETTER} (treated as CJK) runs are wrapped in a {@link Sentence} and handed
 * to the configured {@link Seg}; Russian, Greek, digit and number-like runs are emitted directly as
 * {@link Word}s. Every other character is skipped.
 * <p>
 * Not thread-safe.
 *
 * @author chenlb 2009-9-20 22:41:41
 */
public class MMSeg {
	/**
	 * Code points, as decimal strings, that {@code readChars} accepts in addition to whatever the
	 * active {@link ReadChar} accepts: space(32) '(39) /(47) &amp;(38) +(43) -(45) :(58) !(33) _(95)
	 * .(46) and the middle dot(183).
	 */
	public static List<String> pass = Arrays.asList("32,39,47,38,43,45,58,33,95,46,183".split(","));

	private final Seg seg;
	/** Scratch buffer holding the run that is currently being collected. */
	private final StringBuilder sentenceBuffer = new StringBuilder(256);

	private PushbackReader reader;
	/** Sentence waiting to be segmented by {@link #seg}, or {@code null} when there is none. */
	private Sentence currentSentence;
	/** Words already produced but not yet returned by {@link #next()}. */
	private Queue<Word> wordBuffer;
	/** Index of the most recently read char; {@code -1} before anything has been read. */
	private int readIndex;

	/**
	 * Creates a tokenizer reading from {@code input} and segmenting sentences with {@code seg}.
	 *
	 * @param input character source
	 * @param seg   segmentation algorithm applied to each collected sentence
	 */
	public MMSeg(Reader input, Seg seg) {
		this.seg = seg;
		reset(input);
	}

	/**
	 * Discards all buffered state and starts reading from a new input.
	 *
	 * @param input the new character source
	 */
	public void reset(Reader input) {
		this.reader = new PushbackReader(new BufferedReader(input), 20);
		this.currentSentence = null;
		this.wordBuffer = new LinkedList<>();
		this.sentenceBuffer.setLength(0);
		this.readIndex = -1;
	}

	/**
	 * Reads one char from the underlying reader and advances {@link #readIndex} unless the end of
	 * the stream was reached. No case folding is applied.
	 *
	 * @return the char read, or {@code -1} at end of stream
	 */
	private int readNext() throws IOException {
		int d = reader.read();
		if (d > -1) {
			readIndex++;
		}
		return d;
	}

	/** Pushes one char back onto the reader and rewinds {@link #readIndex} accordingly. */
	private void pushBack(int data) throws IOException {
		readIndex--;
		reader.unread(data);
	}

	/**
	 * Returns the next word of the input.
	 *
	 * @return the next word, or {@code null} once the input is exhausted
	 * @throws IOException if reading from the underlying reader fails
	 */
	public Word next() throws IOException {
		// Serve already-buffered words first.
		Word word = wordBuffer.poll();
		while (word == null) {
			boolean endOfInput = scanNextRun();
			segmentCurrentSentence();
			word = wordBuffer.poll();
			if (endOfInput) {
				break;
			}
			// A run may yield no word at all (everything blank or a stop word). Keep scanning in
			// that case so that null is only ever returned at the real end of the input.
		}
		return word;
	}

	/**
	 * Skips ignorable characters, then consumes exactly one run. Depending on the kind of run this
	 * either sets {@link #currentSentence} or appends finished words to {@link #wordBuffer}.
	 *
	 * @return {@code true} if the end of the input was reached before any run could be started
	 */
	private boolean scanNextRun() throws IOException {
		sentenceBuffer.setLength(0);

		int data = -1;
		boolean skipping = true;
		while (skipping && (data = readNext()) != -1) {
			// By default a single pass consumes one whole run and stops.
			skipping = false;
			switch (Character.getType(data)) {
			case Character.UPPERCASE_LETTER:
			case Character.LOWERCASE_LETTER:
			case Character.TITLECASE_LETTER:
			case Character.MODIFIER_LETTER: {
				/*
				 * Besides ASCII letters: 0x410-0x44f Russian, 0x391-0x3a9 Greek upper case,
				 * 0x3b1-0x3c9 Greek lower case.
				 */
				data = toAscii(data);
				NationLetter nl = getNation(data);
				if (nl == NationLetter.UNKNOW) {
					// A letter of an unsupported script: ignore it and keep looking.
					skipping = true;
					break;
				}
				sentenceBuffer.appendCodePoint(data);
				switch (nl) {
				case EN:
					readChars(sentenceBuffer, new ReadCharByAsciiOrDigitOrCJK());
					currentSentence = createSentence(sentenceBuffer);
					break;
				case RA:
					readChars(sentenceBuffer, new ReadCharByRussia());
					wordBuffer.add(createWord(sentenceBuffer, Word.TYPE_WORD));
					break;
				case GE:
					readChars(sentenceBuffer, new ReadCharByGreece());
					wordBuffer.add(createWord(sentenceBuffer, Word.TYPE_WORD));
					break;
				default:
					break;
				}
				sentenceBuffer.setLength(0);
				break;
			}
			case Character.OTHER_LETTER: {
				/*
				 * Includes e.g. 0x3041-0x30f6 (Japanese hiragana/katakana) and 0x3105-0x3129
				 * (bopomofo phonetic symbols).
				 */
				sentenceBuffer.appendCodePoint(data);
				readChars(sentenceBuffer, new ReadCharByAsciiOrDigitOrCJK());
				currentSentence = createSentence(sentenceBuffer);
				sentenceBuffer.setLength(0);
				break;
			}
			case Character.DECIMAL_DIGIT_NUMBER: {
				readNumberRun(data);
				break;
			}
			case Character.LETTER_NUMBER: {
				// Letter-like numbers such as Roman numerals: every char becomes its own word.
				// The run can also contain pass-through characters accepted by readChars.
				sentenceBuffer.appendCodePoint(data);
				readChars(sentenceBuffer, new ReadCharByType(Character.LETTER_NUMBER));
				int startIdx = startIdx(sentenceBuffer);
				for (int i = 0; i < sentenceBuffer.length(); i++) {
					wordBuffer.add(
							new Word(new char[] { sentenceBuffer.charAt(i) }, startIdx++, Word.TYPE_LETTER_NUMBER));
				}
				sentenceBuffer.setLength(0);
				break;
			}
			case Character.OTHER_NUMBER: {
				// Circled / parenthesized numbers and the like: kept together as one word.
				sentenceBuffer.appendCodePoint(data);
				readChars(sentenceBuffer, new ReadCharByType(Character.OTHER_NUMBER));
				wordBuffer.add(createWord(sentenceBuffer, Word.TYPE_OTHER_NUMBER));
				sentenceBuffer.setLength(0);
				break;
			}
			default:
				// Anything else is treated as an ignorable character.
				skipping = true;
			}
		}
		// The loop ends either because a run was consumed (data is a real char) or because the
		// reader returned -1.
		return data == -1;
	}

	/**
	 * Consumes a run that starts with the decimal digit {@code first} and appends the resulting
	 * word to {@link #wordBuffer}.
	 * <p>
	 * After the digits, one char of look-ahead decides the shape of the word: a unit character (as
	 * judged by {@link Seg#isUnit}) is appended and the word stays {@link Word#TYPE_DIGIT};
	 * otherwise the char is pushed back and any following ASCII letters/digits are appended, which
	 * turns the word into {@link Word#TYPE_DIGIT_OR_LETTER}. The word is emitted even when the input
	 * ends right after the digits, so a trailing number is never lost.
	 */
	private void readNumberRun(int first) throws IOException {
		String wordType = Word.TYPE_DIGIT;
		sentenceBuffer.appendCodePoint(toAscii(first));
		readChars(sentenceBuffer, new ReadCharDigit());

		int d = readNext();
		if (d > -1) {
			if (seg.isUnit(d)) {
				// A unit (e.g. of time) directly after the number stays attached to it.
				sentenceBuffer.appendCodePoint(d);
			} else {
				// Letters and digits may follow; they are all joined into the same word.
				pushBack(d);
				if (readChars(sentenceBuffer, new ReadCharByAsciiOrDigit()) > 0) {
					wordType = Word.TYPE_DIGIT_OR_LETTER;
				}
			}
		}
		wordBuffer.add(createWord(sentenceBuffer, wordType));
		sentenceBuffer.setLength(0);
	}

	/**
	 * Segments {@link #currentSentence}, if any, and appends the resulting words to
	 * {@link #wordBuffer}. Words that are blank after trimming, or that
	 * {@link Dictionary#isStopWord} reports as stop words, are dropped.
	 */
	private void segmentCurrentSentence() throws IOException {
		if (currentSentence == null) {
			return;
		}
		do {
			Chunk chunk = seg.seg(currentSentence);
			for (int i = 0; i < chunk.getCount(); i++) {
				Word w = chunk.getWords()[i];
				String text = w.getString().trim();
				if (text.length() > 0 && !Dictionary.isStopWord(text)) {
					wordBuffer.add(w);
				}
			}
		} while (!currentSentence.isFinish());
		currentSentence = null;
	}

	/**
	 * Appends the following chars to {@code sentenceBuffer} for as long as {@code readChar} (or the
	 * {@link #pass} list) accepts them. The first rejected char is pushed back onto the reader.
	 *
	 * @param sentenceBuffer buffer receiving the accepted (transformed) chars
	 * @param readChar       decides which chars belong to the run
	 * @return the number of chars appended
	 * @throws IOException thrown by {@link #readNext()} or {@link #pushBack(int)}
	 */
	private int readChars(StringBuilder sentenceBuffer, ReadChar readChar) throws IOException {
		int num = 0;
		int data = -1;
		while ((data = readNext()) != -1) {
			int d = readChar.transform(data);
			if (readChar.isRead(d) || pass.contains(String.valueOf(d))) {
				sentenceBuffer.appendCodePoint(d);
				num++;
			} else {
				// Not part of the run: push the untransformed char back for the next step.
				pushBack(data);
				break;
			}
		}
		return num;
	}

	/**
	 * Strategy deciding which chars belong to the run being read.
	 *
	 * @author chenlb 2009-8-15 21:09:50
	 */
	private static abstract class ReadChar {
		/**
		 * Whether this code point belongs to the run. When it does not, reading stops and no
		 * further char is consumed.
		 */
		abstract boolean isRead(int codePoint);

		/** Normalizes a code point before it is tested and stored; identity by default. */
		int transform(int codePoint) {
			return codePoint;
		}
	}

	/**
	 * Accepts decimal digits; full-width digits and letters are converted by {@link #toAscii(int)}.
	 */
	private static class ReadCharDigit extends ReadChar {
		@Override
		boolean isRead(int codePoint) {
			int type = Character.getType(codePoint);
			return isDigit(type);
		}

		@Override
		int transform(int codePoint) {
			return toAscii(codePoint);
		}
	}

	/**
	 * Accepts ASCII letters or decimal digits and remembers whether a digit was seen.
	 */
	public static class ReadCharByAsciiOrDigit extends ReadCharDigit {
		protected boolean hasDigit = false;

		@Override
		boolean isRead(int codePoint) {
			boolean isRead = super.isRead(codePoint);
			hasDigit |= isRead;
			return isAsciiLetter(codePoint) || isRead;
		}

		/** Whether at least one digit has been tested by {@link #isRead(int)} so far. */
		boolean hasDigit() {
			return hasDigit;
		}
	}

	/**
	 * Accepts ASCII letters only.
	 */
	static class ReadCharByAscii extends ReadCharDigit {
		@Override
		boolean isRead(int codePoint) {
			return isAsciiLetter(codePoint);
		}
	}

	/**
	 * Accepts ASCII letters or CJK chars.
	 */
	static class ReadCharByAsciiOrCJK extends ReadCharByAscii {
		@Override
		public boolean isRead(int codePoint) {
			boolean isRead = super.isRead(codePoint);
			int type = Character.getType(codePoint);
			return isRead || isCJK(type);
		}
	}

	/**
	 * Accepts ASCII letters, decimal digits or CJK chars.
	 */
	static class ReadCharByAsciiOrDigitOrCJK extends ReadCharByAsciiOrDigit {
		@Override
		public boolean isRead(int codePoint) {
			boolean isRead = super.isRead(codePoint);
			int type = Character.getType(codePoint);
			return isRead || isCJK(type);
		}
	}

	/**
	 * Accepts Russian letters.
	 */
	private static class ReadCharByRussia extends ReadCharDigit {
		@Override
		boolean isRead(int codePoint) {
			return isRussiaLetter(codePoint);
		}
	}

	/**
	 * Accepts Greek letters.
	 */
	private static class ReadCharByGreece extends ReadCharDigit {
		@Override
		boolean isRead(int codePoint) {
			return isGreeceLetter(codePoint);
		}
	}

	/**
	 * Accepts chars whose {@link Character#getType(int)} equals a fixed type.
	 */
	private static class ReadCharByType extends ReadChar {
		int charType;

		public ReadCharByType(int charType) {
			this.charType = charType;
		}

		@Override
		boolean isRead(int codePoint) {
			int type = Character.getType(codePoint);
			return type == charType;
		}
	}

	/** Builds a word of the given type from the buffer content, positioned at its start index. */
	private Word createWord(StringBuilder sentenceBuffer, String type) {
		return new Word(toChars(sentenceBuffer), startIdx(sentenceBuffer), type);
	}

	/** Builds a sentence from the buffer content, positioned at its start index. */
	private Sentence createSentence(StringBuilder sentenceBuffer) {
		return new Sentence(toChars(sentenceBuffer), startIdx(sentenceBuffer));
	}

	/**
	 * Position, within the whole text, of the first char of {@code sentenceBuffer}. Assumes the
	 * last char of the buffer is the most recently read char.
	 */
	private int startIdx(StringBuilder sentenceBuffer) {
		return readIndex - sentenceBuffer.length() + 1;
	}

	/**
	 * Copies the content of a {@link StringBuilder} into a new {@code char[]}.
	 */
	private static char[] toChars(StringBuilder sentenceBuffer) {
		char[] chs = new char[sentenceBuffer.length()];
		sentenceBuffer.getChars(0, sentenceBuffer.length(), chs, 0);
		return chs;
	}

	/**
	 * Converts full-width digits and Latin letters to their ASCII (half-width) counterparts; every
	 * other code point is returned unchanged.
	 */
	static int toAscii(int codePoint) {
		if ((codePoint >= 0xFF10 && codePoint <= 0xFF19) // full-width 0-9
				|| (codePoint >= 0xFF21 && codePoint <= 0xFF3A) // full-width A-Z
				|| (codePoint >= 0xFF41 && codePoint <= 0xFF5A) // full-width a-z
		) {
			// Distance between the full-width block and ASCII.
			codePoint -= 0xFEE0;
		}
		return codePoint;
	}

	private static boolean isAsciiLetter(int codePoint) {
		return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z');
	}

	private static boolean isRussiaLetter(int codePoint) {
		return (codePoint >= 'А' && codePoint <= 'я') || codePoint == 'Ё' || codePoint == 'ё';
	}

	private static boolean isGreeceLetter(int codePoint) {
		return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω');
	}

	/**
	 * Script of a letter: EN = English (ASCII), RA = Russian, GE = Greek, UNKNOW = anything else.
	 */
	private enum NationLetter {
		EN, RA, GE, UNKNOW
	}

	/** Classifies a letter code point by script; ASCII is tested first, then Russian, then Greek. */
	private static NationLetter getNation(int codePoint) {
		if (isAsciiLetter(codePoint)) {
			return NationLetter.EN;
		}
		if (isRussiaLetter(codePoint)) {
			return NationLetter.RA;
		}
		if (isGreeceLetter(codePoint)) {
			return NationLetter.GE;
		}
		return NationLetter.UNKNOW;
	}

	/** Whether a {@link Character#getType(int)} value is {@link Character#OTHER_LETTER}, which this class treats as CJK. */
	static boolean isCJK(int type) {
		return type == Character.OTHER_LETTER;
	}

	/** Whether a {@link Character#getType(int)} value is {@link Character#DECIMAL_DIGIT_NUMBER}. */
	static boolean isDigit(int type) {
		return type == Character.DECIMAL_DIGIT_NUMBER;
	}

	/**
	 * Whether a {@link Character#getType(int)} value lies between
	 * {@link Character#UPPERCASE_LETTER} and {@link Character#MODIFIER_LETTER}, inclusive.
	 */
	static boolean isLetter(int type) {
		return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER;
	}

	/**
	 * Whether the argument equals 46.
	 * <p>
	 * NOTE(review): 46 is the code point of '.', not a {@link Character#getType(int)} constant, so
	 * despite the parameter name this presumably expects a code point - confirm against callers.
	 */
	static boolean isDot(int type) {
		return type == 46;
	}

	/** Whether {@code c} is an ASCII letter or an ASCII digit. */
	public static boolean isLetterOrDigit(char c) {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
	}
}
